# Packages used by this analysis. library() stops immediately when a package
# is missing, whereas require() only returns FALSE and lets the script fail
# later with a less obvious error.
library(knitr)
library(dplyr)
library(ggplot2)
library(kml)

# Load the training table; the .RData file is expected to define `table.appr`.
load("Donnees/table.appr.RData")
stopifnot(exists("table.appr"), ncol(table.appr) >= 26)

# Columns 3 to 26 hold the 24 values that are clustered below (they are
# plotted against 0:23 further down; presumably one value per hour of the
# day -- TODO confirm against the data dictionary).
don <- table.appr[, 3:26]

On regarde l’évolution du rapport variance intra-classe / variance totale en fonction du nombre de classes k.

# Elbow criterion: for each k in 2..20, fit a k-means on `don` and keep the
# ratio of within-cluster sum of squares to total sum of squares.
#
# iter.max is raised above the default of 10 because a previous run emitted
# "Warning: did not converge in 10 iterations"; a non-converged fit makes the
# ratio for that k unreliable. Runs that already converged are unaffected.
#
# kmeans() starts from randomly chosen centres, so the curve can vary slightly
# between runs unless a seed is set beforehand.
k_values <- 2:20
res <- vapply(
  k_values,
  function(k) {
    fit <- kmeans(don, centers = k, iter.max = 50)
    fit$tot.withinss / fit$totss
  },
  numeric(1)
)
plot(k_values, res, type = "b")

On commence par faire un k-means avec 5, 7 et 10 centres afin de comparer les classes obtenues.

# Reference partitions with 5, 7 and 10 centres.
# iter.max is raised above the default of 10: the same data already triggered
# a "did not converge in 10 iterations" warning in the elbow computation, and
# a non-converged fit would give misleading centres. Fits that converge within
# 10 iterations are unchanged.
gp5 <- kmeans(don, centers = 5, iter.max = 50)
gp7 <- kmeans(don, centers = 7, iter.max = 50)
gp10 <- kmeans(don, centers = 10, iter.max = 50)

On trace les centres des classes ainsi que, pour chaque classe, 10 « station.jour » tirés au hasard.

# Store the cluster assignment of each k-means fit next to the observations.
table.appr$gp5_class <- gp5$cluster
table.appr$gp7_class <- gp7$cluster
table.appr$gp10_class <- gp10$cluster

# Draw one panel per cluster: the cluster centre as a thick blue line, then up
# to `n_sample` randomly chosen member trajectories in grey.
#
#   data      data frame holding the trajectories (one row per observation)
#   classes   cluster label of each row of `data` (integers 1..nrow(centers))
#   centers   matrix with one row per cluster centre
#   layout    c(rows, cols) passed to par(mfrow = ...)
#   traj_cols columns of `data` that hold the trajectory values
#   n_sample  maximum number of member trajectories drawn per panel
#
# The sample size is capped at the cluster size: sample() raises an error when
# asked for more draws than there are members, which would abort the whole
# figure as soon as one class has fewer than `n_sample` observations.
plot_cluster_panels <- function(data, classes, centers, layout,
                                traj_cols = 3:26, n_sample = 10) {
  par(mfrow = layout)
  for (i in seq_len(nrow(centers))) {
    plot(0:23, centers[i, ], type = "l", col = "blue", lwd = 3,
         xlab = "", ylab = "", ylim = c(0, 1))
    members <- which(classes == i)
    picked <- members[sample.int(length(members),
                                 min(n_sample, length(members)))]
    for (j in picked) {
      # unlist() turns the one-row data frame into a plain numeric vector.
      lines(0:23, unlist(data[j, traj_cols], use.names = FALSE), col = "grey")
    }
  }
  invisible(NULL)
}

plot_cluster_panels(table.appr, table.appr$gp5_class, gp5$centers, c(2, 3))
plot_cluster_panels(table.appr, table.appr$gp7_class, gp7$centers, c(2, 4))
plot_cluster_panels(table.appr, table.appr$gp10_class, gp10$centers, c(2, 5))

# Back to a single-panel layout for the following figures.
par(mfrow = c(1, 1))

On essaye le package kml (k-means pour données longitudinales). Le choix du nombre de classes se fait sur un échantillon de 1000 trajectoires.

# Choose the number of clusters with kml on a random subsample of rows.
# The subsample size is capped at the table size so that sample() does not
# fail on tables with fewer than 1000 rows.
n_ech <- min(1000, nrow(table.appr))
ech <- table.appr[sample.int(nrow(table.appr), n_ech), ]
donLD <- clusterLongData(
  traj = ech[, 3:26],
  idAll = paste0(ech$number, " - ", ech$download_date_trunc)
)

# kml() stores its partitions in `donLD` itself, so nothing is assigned here.
kml(donLD, nbClusters = 2:6, nbRedrawing = 20, toPlot = "criterion")

# choice() is an interactive browser of the partitions: it waits for keyboard
# input, so it is only started in an interactive session. The Xlib device is
# only requested on Unix-alikes, where x11() accepts the `type` argument.
# NOTE(review): on other platforms choice() is left to use the default
# device -- confirm this works on the machines where the script is run.
if (interactive()) {
  if (.Platform$OS.type == "unix") {
    x11(type = "Xlib")
  }
  choice(donLD, typeGraph = "bmp")
}

On choisit 4 classes.

# Fit kml with 4 clusters on the full table and keep the result on disk.
donLD <- clusterLongData(
  traj = table.appr[, 3:26],
  idAll = paste0(table.appr$number, " - ", table.appr$download_date_trunc)
)
kml(donLD, nbClusters = 4, nbRedrawing = 20, toPlot = "none")
klm4 <- donLD
save(klm4, file = "Donnees/klm4.RData")
# The object was previously re-loaded from the file right after being saved;
# that round trip restored the very same `klm4` and has been dropped. To skip
# the fit in a later session, run load("Donnees/klm4.RData") instead.

# Cluster label of every row, relabelled 1..4, stored next to the data.
klm.clusters <- getClusters(klm4, 4)
levels(klm.clusters) <- seq(1, 4, 1)
table.appr$klm4 <- klm.clusters

# Mean trajectory of each cluster (one row per cluster).
klm4.mean <- calculTrajMean(table.appr[, 3:26], klm.clusters)

# One panel per cluster: mean trajectory in blue, then up to 10 randomly
# chosen member trajectories in grey.
par(mfrow = c(2, 2))
for (i in 1:4) {
  plot(0:23, klm4.mean[i, ], type = "l", col = "blue", lwd = 3,
       xlab = "", ylab = "", ylim = c(0, 1))
  # Select rows from the label vector directly: inside subset(), `klm4` would
  # mean the column, while outside it is the fitted kml object.
  members <- which(klm.clusters == i)
  # Cap the sample size: sample() errors when asked for more draws than there
  # are members in the cluster.
  picked <- members[sample.int(length(members), min(10, length(members)))]
  for (j in picked) {
    lines(0:23, unlist(table.appr[j, 3:26], use.names = FALSE), col = "grey")
  }
}

# Back to a single-panel layout.
par(mfrow = c(1, 1))